How to Build a "Google Search" System with Python | Code Included
pip install google-cloud-vision
import os, io
from google.cloud import vision
from google.cloud.vision import types
# JSON file that contains your key
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'your_private_key.json'
# Instantiates a client
client = vision.ImageAnnotatorClient()
FILE_NAME = 'your_image_file.jpg'
# Loads the image into memory
with io.open(FILE_NAME, 'rb') as image_file:
    content = image_file.read()

image = vision.types.Image(content=content)
# Performs text detection on the image file
response = client.text_detection(image=image)
print(response)
# The first annotation holds the full block of detected text
texts = response.text_annotations[0]
print(texts.description)
import re
import urllib.parse

# If the question ends with a question mark
if '?' in texts.description:
    question = re.search('([^?]+)', texts.description).group(1)
# If it ends with a colon
elif ':' in texts.description:
    question = re.search('([^:]+)', texts.description).group(1)
# If it ends with a newline
elif '\n' in texts.description:
    question = re.search('([^\n]+)', texts.description).group(1)

# URL-encode the matched question for use in a search query
slugify_keyword = urllib.parse.quote_plus(question)
print(slugify_keyword)
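For example, quote_plus replaces spaces with '+' and percent-encodes everything else, so a detected question (the string below is made up for illustration) becomes a safe query string:

import urllib.parse

# Hypothetical detected question, for illustration only
print(urllib.parse.quote_plus('Which is the hottest planet'))
# Which+is+the+hottest+planet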
Result links in the raw Google HTML are not direct URLs; each one comes wrapped in a redirect of the form:

/url?q=https://en.wikipedia.org/wiki/IAU_definition_of_planet&sa=U&ved=2ahUKEwiSmtrEsaTnAhXtwsQBHduCCO4QFjAAegQIBBAB&usg=AOvVaw0HzMKrBxdHZj5u1Yq1t0en
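A minimal sketch of recovering the real URL from that wrapper, using the same q=(.*)&sa pattern the crawler below relies on (the link value is the example above, with tracking parameters shortened to placeholders):

import re

link = '/url?q=https://en.wikipedia.org/wiki/IAU_definition_of_planet&sa=U&ved=XXX&usg=XXX'
print(re.search('q=(.*)&sa', link).group(1))
# https://en.wikipedia.org/wiki/IAU_definition_of_planet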
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

result_urls = []

def crawl_result_urls():
    req = Request('https://google.com/search?q=' + slugify_keyword,
                  headers={'User-Agent': 'Mozilla/5.0'})
    html = urlopen(req).read()
    bs = BeautifulSoup(html, 'html.parser')
    results = bs.find_all('div', class_='ZINbbc')
    try:
        for result in results:
            link = result.find('a')['href']
            # Keep only wrapped result links (just in case)
            if 'url' in link:
                result_urls.append(re.search('q=(.*)&sa', link).group(1))
    except (AttributeError, IndexError):
        pass
pip install cdqa
import pandas as pd
from ast import literal_eval
from cdqa.utils.filters import filter_paragraphs
from cdqa.utils.download import download_model, download_bnpp_data
from cdqa.pipeline.cdqa_sklearn import QAPipeline
# Download data and models
download_bnpp_data(dir='./data/bnpp_newsroom_v1.1/')
download_model(model='bert-squad_1.1', dir='./models')
# Loading data and filtering / preprocessing the documents
df = pd.read_csv('data/bnpp_newsroom_v1.1/bnpp_newsroom-v1.1.csv', converters={'paragraphs': literal_eval})
df = filter_paragraphs(df)
# Loading QAPipeline with CPU version of BERT Reader pretrained on SQuAD 1.1
cdqa_pipeline = QAPipeline(reader='models/bert_qa.joblib')
# Fitting the retriever to the list of documents in the dataframe
cdqa_pipeline.fit_retriever(df)
# Sending a question to the pipeline and getting prediction
query = 'Since when does the Excellence Program of BNP Paribas exist?'
prediction = cdqa_pipeline.predict(query)
print('query: {}\n'.format(query))
print('answer: {}\n'.format(prediction[0]))
print('title: {}\n'.format(prediction[1]))
print('paragraph: {}\n'.format(prediction[2]))
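Nothing ties the retriever to the BNP Paribas dataset: fit_retriever should accept any DataFrame with a 'title' column and a 'paragraphs' column of string lists, which is the same shape pdf_converter produces later on. A minimal sketch with made-up documents:

# Hypothetical two-document corpus, for illustration only
custom_df = pd.DataFrame({
    'title': ['Planets', 'Stars'],
    'paragraphs': [
        ['Venus is the hottest planet in the solar system.'],
        ['The Sun is a G-type main-sequence star.'],
    ],
})
cdqa_pipeline.fit_retriever(custom_df)
print(cdqa_pipeline.predict('Which is the hottest planet?')[0])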
import errno
import urllib.error

def get_result_details(url):
    try:
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        html = urlopen(req).read()
        bs = BeautifulSoup(html, 'html.parser')
        try:
            # Crawl a heading from the result page to name the pdf file
            title = bs.find(re.compile('^h[1-6]$')).get_text().strip().replace('?', '').lower()
            filename = "/home/coderasha/autoans/pdfs/" + title + ".pdf"
            # Create the target directory if it does not exist yet
            if not os.path.exists(os.path.dirname(filename)):
                try:
                    os.makedirs(os.path.dirname(filename))
                except OSError as exc:  # Guard against a race condition
                    if exc.errno != errno.EEXIST:
                        raise
            with open(filename, 'w') as f:
                # Save the first 5 paragraphs of the page
                for line in bs.find_all('p')[:5]:
                    f.write(line.text + '\n')
        except AttributeError:
            pass
    except urllib.error.HTTPError:
        pass
from cdqa.utils.converters import pdf_converter

def find_answer():
    df = pdf_converter(directory_path='/home/coderasha/autoans/pdfs')
    cdqa_pipeline = QAPipeline(reader='models/bert_qa.joblib')
    cdqa_pipeline.fit_retriever(df)
    query = question + '?'
    prediction = cdqa_pipeline.predict(query)
    print('query: {}\n'.format(query))
    print('answer: {}\n'.format(prediction[0]))
    print('title: {}\n'.format(prediction[1]))
    print('paragraph: {}\n'.format(prediction[2]))
    return prediction[0]
import os, io
import errno
import urllib
import urllib.request
import hashlib
import re
import requests
from time import sleep
from google.cloud import vision
from google.cloud.vision import types
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import pandas as pd
from ast import literal_eval
from cdqa.utils.filters import filter_paragraphs
from cdqa.utils.download import download_model, download_bnpp_data
from cdqa.pipeline.cdqa_sklearn import QAPipeline
from cdqa.utils.converters import pdf_converter
result_urls = []
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'your_private_key.json'
client = vision.ImageAnnotatorClient()
FILE_NAME = 'your_image_file.jpg'
with io.open(FILE_NAME, 'rb') as image_file:
    content = image_file.read()

image = vision.types.Image(content=content)
response = client.text_detection(image=image)
texts = response.text_annotations[0]
# print(texts.description)
if '?' in texts.description:
    question = re.search('([^?]+)', texts.description).group(1)
elif ':' in texts.description:
    question = re.search('([^:]+)', texts.description).group(1)
elif '\n' in texts.description:
    question = re.search('([^\n]+)', texts.description).group(1)
slugify_keyword = urllib.parse.quote_plus(question)
# print(slugify_keyword)
def crawl_result_urls():
    req = Request('https://google.com/search?q=' + slugify_keyword,
                  headers={'User-Agent': 'Mozilla/5.0'})
    html = urlopen(req).read()
    bs = BeautifulSoup(html, 'html.parser')
    results = bs.find_all('div', class_='ZINbbc')
    try:
        for result in results:
            link = result.find('a')['href']
            print(link)
            if 'url' in link:
                result_urls.append(re.search('q=(.*)&sa', link).group(1))
    except (AttributeError, IndexError):
        pass
def get_result_details(url):
    try:
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        html = urlopen(req).read()
        bs = BeautifulSoup(html, 'html.parser')
        try:
            title = bs.find(re.compile('^h[1-6]$')).get_text().strip().replace('?', '').lower()
            # Set your path to the pdf directory
            filename = "/path/to/pdf_folder/" + title + ".pdf"
            if not os.path.exists(os.path.dirname(filename)):
                try:
                    os.makedirs(os.path.dirname(filename))
                except OSError as exc:
                    if exc.errno != errno.EEXIST:
                        raise
            with open(filename, 'w') as f:
                for line in bs.find_all('p')[:5]:
                    f.write(line.text + '\n')
        except AttributeError:
            pass
    except urllib.error.HTTPError:
        pass
def find_answer():
    # Set your path to the pdf directory
    df = pdf_converter(directory_path='/path/to/pdf_folder/')
    cdqa_pipeline = QAPipeline(reader='models/bert_qa.joblib')
    cdqa_pipeline.fit_retriever(df)
    query = question + '?'
    prediction = cdqa_pipeline.predict(query)
    # print('query: {}\n'.format(query))
    # print('answer: {}\n'.format(prediction[0]))
    # print('title: {}\n'.format(prediction[1]))
    # print('paragraph: {}\n'.format(prediction[2]))
    return prediction[0]

crawl_result_urls()
for url in result_urls[:3]:
    get_result_details(url)
    sleep(5)

answer = find_answer()
print('Answer: ' + answer)